## Load the pre-saved Samsung Human Activity Recognition (HAR) data set;
## this creates the `samsungData` data frame in the workspace.
load("data/samsungData.rda") # load data
dim(samsungData)
## [1] 7352 563
# Cache the dimensions: dimdata[2] (563) is used throughout as the index of
# the last column (the `activity` outcome); dimdata[2]-1 is `subject`.
dimdata = dim(samsungData)
dim(samsungData)
## [1] 7352 563
# Inspect all 563 column names (561 features + subject + activity).
colnames(samsungData)
## [1] "tBodyAcc-mean()-X"
## [2] "tBodyAcc-mean()-Y"
## [3] "tBodyAcc-mean()-Z"
## [4] "tBodyAcc-std()-X"
## [5] "tBodyAcc-std()-Y"
## [6] "tBodyAcc-std()-Z"
## [7] "tBodyAcc-mad()-X"
## [8] "tBodyAcc-mad()-Y"
## [9] "tBodyAcc-mad()-Z"
## [10] "tBodyAcc-max()-X"
## [11] "tBodyAcc-max()-Y"
## [12] "tBodyAcc-max()-Z"
## [13] "tBodyAcc-min()-X"
## [14] "tBodyAcc-min()-Y"
## [15] "tBodyAcc-min()-Z"
## [16] "tBodyAcc-sma()"
## [17] "tBodyAcc-energy()-X"
## [18] "tBodyAcc-energy()-Y"
## [19] "tBodyAcc-energy()-Z"
## [20] "tBodyAcc-iqr()-X"
## [21] "tBodyAcc-iqr()-Y"
## [22] "tBodyAcc-iqr()-Z"
## [23] "tBodyAcc-entropy()-X"
## [24] "tBodyAcc-entropy()-Y"
## [25] "tBodyAcc-entropy()-Z"
## [26] "tBodyAcc-arCoeff()-X,1"
## [27] "tBodyAcc-arCoeff()-X,2"
## [28] "tBodyAcc-arCoeff()-X,3"
## [29] "tBodyAcc-arCoeff()-X,4"
## [30] "tBodyAcc-arCoeff()-Y,1"
## [31] "tBodyAcc-arCoeff()-Y,2"
## [32] "tBodyAcc-arCoeff()-Y,3"
## [33] "tBodyAcc-arCoeff()-Y,4"
## [34] "tBodyAcc-arCoeff()-Z,1"
## [35] "tBodyAcc-arCoeff()-Z,2"
## [36] "tBodyAcc-arCoeff()-Z,3"
## [37] "tBodyAcc-arCoeff()-Z,4"
## [38] "tBodyAcc-correlation()-X,Y"
## [39] "tBodyAcc-correlation()-X,Z"
## [40] "tBodyAcc-correlation()-Y,Z"
## [41] "tGravityAcc-mean()-X"
## [42] "tGravityAcc-mean()-Y"
## [43] "tGravityAcc-mean()-Z"
## [44] "tGravityAcc-std()-X"
## [45] "tGravityAcc-std()-Y"
## [46] "tGravityAcc-std()-Z"
## [47] "tGravityAcc-mad()-X"
## [48] "tGravityAcc-mad()-Y"
## [49] "tGravityAcc-mad()-Z"
## [50] "tGravityAcc-max()-X"
## [51] "tGravityAcc-max()-Y"
## [52] "tGravityAcc-max()-Z"
## [53] "tGravityAcc-min()-X"
## [54] "tGravityAcc-min()-Y"
## [55] "tGravityAcc-min()-Z"
## [56] "tGravityAcc-sma()"
## [57] "tGravityAcc-energy()-X"
## [58] "tGravityAcc-energy()-Y"
## [59] "tGravityAcc-energy()-Z"
## [60] "tGravityAcc-iqr()-X"
## [61] "tGravityAcc-iqr()-Y"
## [62] "tGravityAcc-iqr()-Z"
## [63] "tGravityAcc-entropy()-X"
## [64] "tGravityAcc-entropy()-Y"
## [65] "tGravityAcc-entropy()-Z"
## [66] "tGravityAcc-arCoeff()-X,1"
## [67] "tGravityAcc-arCoeff()-X,2"
## [68] "tGravityAcc-arCoeff()-X,3"
## [69] "tGravityAcc-arCoeff()-X,4"
## [70] "tGravityAcc-arCoeff()-Y,1"
## [71] "tGravityAcc-arCoeff()-Y,2"
## [72] "tGravityAcc-arCoeff()-Y,3"
## [73] "tGravityAcc-arCoeff()-Y,4"
## [74] "tGravityAcc-arCoeff()-Z,1"
## [75] "tGravityAcc-arCoeff()-Z,2"
## [76] "tGravityAcc-arCoeff()-Z,3"
## [77] "tGravityAcc-arCoeff()-Z,4"
## [78] "tGravityAcc-correlation()-X,Y"
## [79] "tGravityAcc-correlation()-X,Z"
## [80] "tGravityAcc-correlation()-Y,Z"
## [81] "tBodyAccJerk-mean()-X"
## [82] "tBodyAccJerk-mean()-Y"
## [83] "tBodyAccJerk-mean()-Z"
## [84] "tBodyAccJerk-std()-X"
## [85] "tBodyAccJerk-std()-Y"
## [86] "tBodyAccJerk-std()-Z"
## [87] "tBodyAccJerk-mad()-X"
## [88] "tBodyAccJerk-mad()-Y"
## [89] "tBodyAccJerk-mad()-Z"
## [90] "tBodyAccJerk-max()-X"
## [91] "tBodyAccJerk-max()-Y"
## [92] "tBodyAccJerk-max()-Z"
## [93] "tBodyAccJerk-min()-X"
## [94] "tBodyAccJerk-min()-Y"
## [95] "tBodyAccJerk-min()-Z"
## [96] "tBodyAccJerk-sma()"
## [97] "tBodyAccJerk-energy()-X"
## [98] "tBodyAccJerk-energy()-Y"
## [99] "tBodyAccJerk-energy()-Z"
## [100] "tBodyAccJerk-iqr()-X"
## [101] "tBodyAccJerk-iqr()-Y"
## [102] "tBodyAccJerk-iqr()-Z"
## [103] "tBodyAccJerk-entropy()-X"
## [104] "tBodyAccJerk-entropy()-Y"
## [105] "tBodyAccJerk-entropy()-Z"
## [106] "tBodyAccJerk-arCoeff()-X,1"
## [107] "tBodyAccJerk-arCoeff()-X,2"
## [108] "tBodyAccJerk-arCoeff()-X,3"
## [109] "tBodyAccJerk-arCoeff()-X,4"
## [110] "tBodyAccJerk-arCoeff()-Y,1"
## [111] "tBodyAccJerk-arCoeff()-Y,2"
## [112] "tBodyAccJerk-arCoeff()-Y,3"
## [113] "tBodyAccJerk-arCoeff()-Y,4"
## [114] "tBodyAccJerk-arCoeff()-Z,1"
## [115] "tBodyAccJerk-arCoeff()-Z,2"
## [116] "tBodyAccJerk-arCoeff()-Z,3"
## [117] "tBodyAccJerk-arCoeff()-Z,4"
## [118] "tBodyAccJerk-correlation()-X,Y"
## [119] "tBodyAccJerk-correlation()-X,Z"
## [120] "tBodyAccJerk-correlation()-Y,Z"
## [121] "tBodyGyro-mean()-X"
## [122] "tBodyGyro-mean()-Y"
## [123] "tBodyGyro-mean()-Z"
## [124] "tBodyGyro-std()-X"
## [125] "tBodyGyro-std()-Y"
## [126] "tBodyGyro-std()-Z"
## [127] "tBodyGyro-mad()-X"
## [128] "tBodyGyro-mad()-Y"
## [129] "tBodyGyro-mad()-Z"
## [130] "tBodyGyro-max()-X"
## [131] "tBodyGyro-max()-Y"
## [132] "tBodyGyro-max()-Z"
## [133] "tBodyGyro-min()-X"
## [134] "tBodyGyro-min()-Y"
## [135] "tBodyGyro-min()-Z"
## [136] "tBodyGyro-sma()"
## [137] "tBodyGyro-energy()-X"
## [138] "tBodyGyro-energy()-Y"
## [139] "tBodyGyro-energy()-Z"
## [140] "tBodyGyro-iqr()-X"
## [141] "tBodyGyro-iqr()-Y"
## [142] "tBodyGyro-iqr()-Z"
## [143] "tBodyGyro-entropy()-X"
## [144] "tBodyGyro-entropy()-Y"
## [145] "tBodyGyro-entropy()-Z"
## [146] "tBodyGyro-arCoeff()-X,1"
## [147] "tBodyGyro-arCoeff()-X,2"
## [148] "tBodyGyro-arCoeff()-X,3"
## [149] "tBodyGyro-arCoeff()-X,4"
## [150] "tBodyGyro-arCoeff()-Y,1"
## [151] "tBodyGyro-arCoeff()-Y,2"
## [152] "tBodyGyro-arCoeff()-Y,3"
## [153] "tBodyGyro-arCoeff()-Y,4"
## [154] "tBodyGyro-arCoeff()-Z,1"
## [155] "tBodyGyro-arCoeff()-Z,2"
## [156] "tBodyGyro-arCoeff()-Z,3"
## [157] "tBodyGyro-arCoeff()-Z,4"
## [158] "tBodyGyro-correlation()-X,Y"
## [159] "tBodyGyro-correlation()-X,Z"
## [160] "tBodyGyro-correlation()-Y,Z"
## [161] "tBodyGyroJerk-mean()-X"
## [162] "tBodyGyroJerk-mean()-Y"
## [163] "tBodyGyroJerk-mean()-Z"
## [164] "tBodyGyroJerk-std()-X"
## [165] "tBodyGyroJerk-std()-Y"
## [166] "tBodyGyroJerk-std()-Z"
## [167] "tBodyGyroJerk-mad()-X"
## [168] "tBodyGyroJerk-mad()-Y"
## [169] "tBodyGyroJerk-mad()-Z"
## [170] "tBodyGyroJerk-max()-X"
## [171] "tBodyGyroJerk-max()-Y"
## [172] "tBodyGyroJerk-max()-Z"
## [173] "tBodyGyroJerk-min()-X"
## [174] "tBodyGyroJerk-min()-Y"
## [175] "tBodyGyroJerk-min()-Z"
## [176] "tBodyGyroJerk-sma()"
## [177] "tBodyGyroJerk-energy()-X"
## [178] "tBodyGyroJerk-energy()-Y"
## [179] "tBodyGyroJerk-energy()-Z"
## [180] "tBodyGyroJerk-iqr()-X"
## [181] "tBodyGyroJerk-iqr()-Y"
## [182] "tBodyGyroJerk-iqr()-Z"
## [183] "tBodyGyroJerk-entropy()-X"
## [184] "tBodyGyroJerk-entropy()-Y"
## [185] "tBodyGyroJerk-entropy()-Z"
## [186] "tBodyGyroJerk-arCoeff()-X,1"
## [187] "tBodyGyroJerk-arCoeff()-X,2"
## [188] "tBodyGyroJerk-arCoeff()-X,3"
## [189] "tBodyGyroJerk-arCoeff()-X,4"
## [190] "tBodyGyroJerk-arCoeff()-Y,1"
## [191] "tBodyGyroJerk-arCoeff()-Y,2"
## [192] "tBodyGyroJerk-arCoeff()-Y,3"
## [193] "tBodyGyroJerk-arCoeff()-Y,4"
## [194] "tBodyGyroJerk-arCoeff()-Z,1"
## [195] "tBodyGyroJerk-arCoeff()-Z,2"
## [196] "tBodyGyroJerk-arCoeff()-Z,3"
## [197] "tBodyGyroJerk-arCoeff()-Z,4"
## [198] "tBodyGyroJerk-correlation()-X,Y"
## [199] "tBodyGyroJerk-correlation()-X,Z"
## [200] "tBodyGyroJerk-correlation()-Y,Z"
## [201] "tBodyAccMag-mean()"
## [202] "tBodyAccMag-std()"
## [203] "tBodyAccMag-mad()"
## [204] "tBodyAccMag-max()"
## [205] "tBodyAccMag-min()"
## [206] "tBodyAccMag-sma()"
## [207] "tBodyAccMag-energy()"
## [208] "tBodyAccMag-iqr()"
## [209] "tBodyAccMag-entropy()"
## [210] "tBodyAccMag-arCoeff()1"
## [211] "tBodyAccMag-arCoeff()2"
## [212] "tBodyAccMag-arCoeff()3"
## [213] "tBodyAccMag-arCoeff()4"
## [214] "tGravityAccMag-mean()"
## [215] "tGravityAccMag-std()"
## [216] "tGravityAccMag-mad()"
## [217] "tGravityAccMag-max()"
## [218] "tGravityAccMag-min()"
## [219] "tGravityAccMag-sma()"
## [220] "tGravityAccMag-energy()"
## [221] "tGravityAccMag-iqr()"
## [222] "tGravityAccMag-entropy()"
## [223] "tGravityAccMag-arCoeff()1"
## [224] "tGravityAccMag-arCoeff()2"
## [225] "tGravityAccMag-arCoeff()3"
## [226] "tGravityAccMag-arCoeff()4"
## [227] "tBodyAccJerkMag-mean()"
## [228] "tBodyAccJerkMag-std()"
## [229] "tBodyAccJerkMag-mad()"
## [230] "tBodyAccJerkMag-max()"
## [231] "tBodyAccJerkMag-min()"
## [232] "tBodyAccJerkMag-sma()"
## [233] "tBodyAccJerkMag-energy()"
## [234] "tBodyAccJerkMag-iqr()"
## [235] "tBodyAccJerkMag-entropy()"
## [236] "tBodyAccJerkMag-arCoeff()1"
## [237] "tBodyAccJerkMag-arCoeff()2"
## [238] "tBodyAccJerkMag-arCoeff()3"
## [239] "tBodyAccJerkMag-arCoeff()4"
## [240] "tBodyGyroMag-mean()"
## [241] "tBodyGyroMag-std()"
## [242] "tBodyGyroMag-mad()"
## [243] "tBodyGyroMag-max()"
## [244] "tBodyGyroMag-min()"
## [245] "tBodyGyroMag-sma()"
## [246] "tBodyGyroMag-energy()"
## [247] "tBodyGyroMag-iqr()"
## [248] "tBodyGyroMag-entropy()"
## [249] "tBodyGyroMag-arCoeff()1"
## [250] "tBodyGyroMag-arCoeff()2"
## [251] "tBodyGyroMag-arCoeff()3"
## [252] "tBodyGyroMag-arCoeff()4"
## [253] "tBodyGyroJerkMag-mean()"
## [254] "tBodyGyroJerkMag-std()"
## [255] "tBodyGyroJerkMag-mad()"
## [256] "tBodyGyroJerkMag-max()"
## [257] "tBodyGyroJerkMag-min()"
## [258] "tBodyGyroJerkMag-sma()"
## [259] "tBodyGyroJerkMag-energy()"
## [260] "tBodyGyroJerkMag-iqr()"
## [261] "tBodyGyroJerkMag-entropy()"
## [262] "tBodyGyroJerkMag-arCoeff()1"
## [263] "tBodyGyroJerkMag-arCoeff()2"
## [264] "tBodyGyroJerkMag-arCoeff()3"
## [265] "tBodyGyroJerkMag-arCoeff()4"
## [266] "fBodyAcc-mean()-X"
## [267] "fBodyAcc-mean()-Y"
## [268] "fBodyAcc-mean()-Z"
## [269] "fBodyAcc-std()-X"
## [270] "fBodyAcc-std()-Y"
## [271] "fBodyAcc-std()-Z"
## [272] "fBodyAcc-mad()-X"
## [273] "fBodyAcc-mad()-Y"
## [274] "fBodyAcc-mad()-Z"
## [275] "fBodyAcc-max()-X"
## [276] "fBodyAcc-max()-Y"
## [277] "fBodyAcc-max()-Z"
## [278] "fBodyAcc-min()-X"
## [279] "fBodyAcc-min()-Y"
## [280] "fBodyAcc-min()-Z"
## [281] "fBodyAcc-sma()"
## [282] "fBodyAcc-energy()-X"
## [283] "fBodyAcc-energy()-Y"
## [284] "fBodyAcc-energy()-Z"
## [285] "fBodyAcc-iqr()-X"
## [286] "fBodyAcc-iqr()-Y"
## [287] "fBodyAcc-iqr()-Z"
## [288] "fBodyAcc-entropy()-X"
## [289] "fBodyAcc-entropy()-Y"
## [290] "fBodyAcc-entropy()-Z"
## [291] "fBodyAcc-maxInds-X"
## [292] "fBodyAcc-maxInds-Y"
## [293] "fBodyAcc-maxInds-Z"
## [294] "fBodyAcc-meanFreq()-X"
## [295] "fBodyAcc-meanFreq()-Y"
## [296] "fBodyAcc-meanFreq()-Z"
## [297] "fBodyAcc-skewness()-X"
## [298] "fBodyAcc-kurtosis()-X"
## [299] "fBodyAcc-skewness()-Y"
## [300] "fBodyAcc-kurtosis()-Y"
## [301] "fBodyAcc-skewness()-Z"
## [302] "fBodyAcc-kurtosis()-Z"
## [303] "fBodyAcc-bandsEnergy()-1,8"
## [304] "fBodyAcc-bandsEnergy()-9,16"
## [305] "fBodyAcc-bandsEnergy()-17,24"
## [306] "fBodyAcc-bandsEnergy()-25,32"
## [307] "fBodyAcc-bandsEnergy()-33,40"
## [308] "fBodyAcc-bandsEnergy()-41,48"
## [309] "fBodyAcc-bandsEnergy()-49,56"
## [310] "fBodyAcc-bandsEnergy()-57,64"
## [311] "fBodyAcc-bandsEnergy()-1,16"
## [312] "fBodyAcc-bandsEnergy()-17,32"
## [313] "fBodyAcc-bandsEnergy()-33,48"
## [314] "fBodyAcc-bandsEnergy()-49,64"
## [315] "fBodyAcc-bandsEnergy()-1,24"
## [316] "fBodyAcc-bandsEnergy()-25,48"
## [317] "fBodyAcc-bandsEnergy()-1,8"
## [318] "fBodyAcc-bandsEnergy()-9,16"
## [319] "fBodyAcc-bandsEnergy()-17,24"
## [320] "fBodyAcc-bandsEnergy()-25,32"
## [321] "fBodyAcc-bandsEnergy()-33,40"
## [322] "fBodyAcc-bandsEnergy()-41,48"
## [323] "fBodyAcc-bandsEnergy()-49,56"
## [324] "fBodyAcc-bandsEnergy()-57,64"
## [325] "fBodyAcc-bandsEnergy()-1,16"
## [326] "fBodyAcc-bandsEnergy()-17,32"
## [327] "fBodyAcc-bandsEnergy()-33,48"
## [328] "fBodyAcc-bandsEnergy()-49,64"
## [329] "fBodyAcc-bandsEnergy()-1,24"
## [330] "fBodyAcc-bandsEnergy()-25,48"
## [331] "fBodyAcc-bandsEnergy()-1,8"
## [332] "fBodyAcc-bandsEnergy()-9,16"
## [333] "fBodyAcc-bandsEnergy()-17,24"
## [334] "fBodyAcc-bandsEnergy()-25,32"
## [335] "fBodyAcc-bandsEnergy()-33,40"
## [336] "fBodyAcc-bandsEnergy()-41,48"
## [337] "fBodyAcc-bandsEnergy()-49,56"
## [338] "fBodyAcc-bandsEnergy()-57,64"
## [339] "fBodyAcc-bandsEnergy()-1,16"
## [340] "fBodyAcc-bandsEnergy()-17,32"
## [341] "fBodyAcc-bandsEnergy()-33,48"
## [342] "fBodyAcc-bandsEnergy()-49,64"
## [343] "fBodyAcc-bandsEnergy()-1,24"
## [344] "fBodyAcc-bandsEnergy()-25,48"
## [345] "fBodyAccJerk-mean()-X"
## [346] "fBodyAccJerk-mean()-Y"
## [347] "fBodyAccJerk-mean()-Z"
## [348] "fBodyAccJerk-std()-X"
## [349] "fBodyAccJerk-std()-Y"
## [350] "fBodyAccJerk-std()-Z"
## [351] "fBodyAccJerk-mad()-X"
## [352] "fBodyAccJerk-mad()-Y"
## [353] "fBodyAccJerk-mad()-Z"
## [354] "fBodyAccJerk-max()-X"
## [355] "fBodyAccJerk-max()-Y"
## [356] "fBodyAccJerk-max()-Z"
## [357] "fBodyAccJerk-min()-X"
## [358] "fBodyAccJerk-min()-Y"
## [359] "fBodyAccJerk-min()-Z"
## [360] "fBodyAccJerk-sma()"
## [361] "fBodyAccJerk-energy()-X"
## [362] "fBodyAccJerk-energy()-Y"
## [363] "fBodyAccJerk-energy()-Z"
## [364] "fBodyAccJerk-iqr()-X"
## [365] "fBodyAccJerk-iqr()-Y"
## [366] "fBodyAccJerk-iqr()-Z"
## [367] "fBodyAccJerk-entropy()-X"
## [368] "fBodyAccJerk-entropy()-Y"
## [369] "fBodyAccJerk-entropy()-Z"
## [370] "fBodyAccJerk-maxInds-X"
## [371] "fBodyAccJerk-maxInds-Y"
## [372] "fBodyAccJerk-maxInds-Z"
## [373] "fBodyAccJerk-meanFreq()-X"
## [374] "fBodyAccJerk-meanFreq()-Y"
## [375] "fBodyAccJerk-meanFreq()-Z"
## [376] "fBodyAccJerk-skewness()-X"
## [377] "fBodyAccJerk-kurtosis()-X"
## [378] "fBodyAccJerk-skewness()-Y"
## [379] "fBodyAccJerk-kurtosis()-Y"
## [380] "fBodyAccJerk-skewness()-Z"
## [381] "fBodyAccJerk-kurtosis()-Z"
## [382] "fBodyAccJerk-bandsEnergy()-1,8"
## [383] "fBodyAccJerk-bandsEnergy()-9,16"
## [384] "fBodyAccJerk-bandsEnergy()-17,24"
## [385] "fBodyAccJerk-bandsEnergy()-25,32"
## [386] "fBodyAccJerk-bandsEnergy()-33,40"
## [387] "fBodyAccJerk-bandsEnergy()-41,48"
## [388] "fBodyAccJerk-bandsEnergy()-49,56"
## [389] "fBodyAccJerk-bandsEnergy()-57,64"
## [390] "fBodyAccJerk-bandsEnergy()-1,16"
## [391] "fBodyAccJerk-bandsEnergy()-17,32"
## [392] "fBodyAccJerk-bandsEnergy()-33,48"
## [393] "fBodyAccJerk-bandsEnergy()-49,64"
## [394] "fBodyAccJerk-bandsEnergy()-1,24"
## [395] "fBodyAccJerk-bandsEnergy()-25,48"
## [396] "fBodyAccJerk-bandsEnergy()-1,8"
## [397] "fBodyAccJerk-bandsEnergy()-9,16"
## [398] "fBodyAccJerk-bandsEnergy()-17,24"
## [399] "fBodyAccJerk-bandsEnergy()-25,32"
## [400] "fBodyAccJerk-bandsEnergy()-33,40"
## [401] "fBodyAccJerk-bandsEnergy()-41,48"
## [402] "fBodyAccJerk-bandsEnergy()-49,56"
## [403] "fBodyAccJerk-bandsEnergy()-57,64"
## [404] "fBodyAccJerk-bandsEnergy()-1,16"
## [405] "fBodyAccJerk-bandsEnergy()-17,32"
## [406] "fBodyAccJerk-bandsEnergy()-33,48"
## [407] "fBodyAccJerk-bandsEnergy()-49,64"
## [408] "fBodyAccJerk-bandsEnergy()-1,24"
## [409] "fBodyAccJerk-bandsEnergy()-25,48"
## [410] "fBodyAccJerk-bandsEnergy()-1,8"
## [411] "fBodyAccJerk-bandsEnergy()-9,16"
## [412] "fBodyAccJerk-bandsEnergy()-17,24"
## [413] "fBodyAccJerk-bandsEnergy()-25,32"
## [414] "fBodyAccJerk-bandsEnergy()-33,40"
## [415] "fBodyAccJerk-bandsEnergy()-41,48"
## [416] "fBodyAccJerk-bandsEnergy()-49,56"
## [417] "fBodyAccJerk-bandsEnergy()-57,64"
## [418] "fBodyAccJerk-bandsEnergy()-1,16"
## [419] "fBodyAccJerk-bandsEnergy()-17,32"
## [420] "fBodyAccJerk-bandsEnergy()-33,48"
## [421] "fBodyAccJerk-bandsEnergy()-49,64"
## [422] "fBodyAccJerk-bandsEnergy()-1,24"
## [423] "fBodyAccJerk-bandsEnergy()-25,48"
## [424] "fBodyGyro-mean()-X"
## [425] "fBodyGyro-mean()-Y"
## [426] "fBodyGyro-mean()-Z"
## [427] "fBodyGyro-std()-X"
## [428] "fBodyGyro-std()-Y"
## [429] "fBodyGyro-std()-Z"
## [430] "fBodyGyro-mad()-X"
## [431] "fBodyGyro-mad()-Y"
## [432] "fBodyGyro-mad()-Z"
## [433] "fBodyGyro-max()-X"
## [434] "fBodyGyro-max()-Y"
## [435] "fBodyGyro-max()-Z"
## [436] "fBodyGyro-min()-X"
## [437] "fBodyGyro-min()-Y"
## [438] "fBodyGyro-min()-Z"
## [439] "fBodyGyro-sma()"
## [440] "fBodyGyro-energy()-X"
## [441] "fBodyGyro-energy()-Y"
## [442] "fBodyGyro-energy()-Z"
## [443] "fBodyGyro-iqr()-X"
## [444] "fBodyGyro-iqr()-Y"
## [445] "fBodyGyro-iqr()-Z"
## [446] "fBodyGyro-entropy()-X"
## [447] "fBodyGyro-entropy()-Y"
## [448] "fBodyGyro-entropy()-Z"
## [449] "fBodyGyro-maxInds-X"
## [450] "fBodyGyro-maxInds-Y"
## [451] "fBodyGyro-maxInds-Z"
## [452] "fBodyGyro-meanFreq()-X"
## [453] "fBodyGyro-meanFreq()-Y"
## [454] "fBodyGyro-meanFreq()-Z"
## [455] "fBodyGyro-skewness()-X"
## [456] "fBodyGyro-kurtosis()-X"
## [457] "fBodyGyro-skewness()-Y"
## [458] "fBodyGyro-kurtosis()-Y"
## [459] "fBodyGyro-skewness()-Z"
## [460] "fBodyGyro-kurtosis()-Z"
## [461] "fBodyGyro-bandsEnergy()-1,8"
## [462] "fBodyGyro-bandsEnergy()-9,16"
## [463] "fBodyGyro-bandsEnergy()-17,24"
## [464] "fBodyGyro-bandsEnergy()-25,32"
## [465] "fBodyGyro-bandsEnergy()-33,40"
## [466] "fBodyGyro-bandsEnergy()-41,48"
## [467] "fBodyGyro-bandsEnergy()-49,56"
## [468] "fBodyGyro-bandsEnergy()-57,64"
## [469] "fBodyGyro-bandsEnergy()-1,16"
## [470] "fBodyGyro-bandsEnergy()-17,32"
## [471] "fBodyGyro-bandsEnergy()-33,48"
## [472] "fBodyGyro-bandsEnergy()-49,64"
## [473] "fBodyGyro-bandsEnergy()-1,24"
## [474] "fBodyGyro-bandsEnergy()-25,48"
## [475] "fBodyGyro-bandsEnergy()-1,8"
## [476] "fBodyGyro-bandsEnergy()-9,16"
## [477] "fBodyGyro-bandsEnergy()-17,24"
## [478] "fBodyGyro-bandsEnergy()-25,32"
## [479] "fBodyGyro-bandsEnergy()-33,40"
## [480] "fBodyGyro-bandsEnergy()-41,48"
## [481] "fBodyGyro-bandsEnergy()-49,56"
## [482] "fBodyGyro-bandsEnergy()-57,64"
## [483] "fBodyGyro-bandsEnergy()-1,16"
## [484] "fBodyGyro-bandsEnergy()-17,32"
## [485] "fBodyGyro-bandsEnergy()-33,48"
## [486] "fBodyGyro-bandsEnergy()-49,64"
## [487] "fBodyGyro-bandsEnergy()-1,24"
## [488] "fBodyGyro-bandsEnergy()-25,48"
## [489] "fBodyGyro-bandsEnergy()-1,8"
## [490] "fBodyGyro-bandsEnergy()-9,16"
## [491] "fBodyGyro-bandsEnergy()-17,24"
## [492] "fBodyGyro-bandsEnergy()-25,32"
## [493] "fBodyGyro-bandsEnergy()-33,40"
## [494] "fBodyGyro-bandsEnergy()-41,48"
## [495] "fBodyGyro-bandsEnergy()-49,56"
## [496] "fBodyGyro-bandsEnergy()-57,64"
## [497] "fBodyGyro-bandsEnergy()-1,16"
## [498] "fBodyGyro-bandsEnergy()-17,32"
## [499] "fBodyGyro-bandsEnergy()-33,48"
## [500] "fBodyGyro-bandsEnergy()-49,64"
## [501] "fBodyGyro-bandsEnergy()-1,24"
## [502] "fBodyGyro-bandsEnergy()-25,48"
## [503] "fBodyAccMag-mean()"
## [504] "fBodyAccMag-std()"
## [505] "fBodyAccMag-mad()"
## [506] "fBodyAccMag-max()"
## [507] "fBodyAccMag-min()"
## [508] "fBodyAccMag-sma()"
## [509] "fBodyAccMag-energy()"
## [510] "fBodyAccMag-iqr()"
## [511] "fBodyAccMag-entropy()"
## [512] "fBodyAccMag-maxInds"
## [513] "fBodyAccMag-meanFreq()"
## [514] "fBodyAccMag-skewness()"
## [515] "fBodyAccMag-kurtosis()"
## [516] "fBodyBodyAccJerkMag-mean()"
## [517] "fBodyBodyAccJerkMag-std()"
## [518] "fBodyBodyAccJerkMag-mad()"
## [519] "fBodyBodyAccJerkMag-max()"
## [520] "fBodyBodyAccJerkMag-min()"
## [521] "fBodyBodyAccJerkMag-sma()"
## [522] "fBodyBodyAccJerkMag-energy()"
## [523] "fBodyBodyAccJerkMag-iqr()"
## [524] "fBodyBodyAccJerkMag-entropy()"
## [525] "fBodyBodyAccJerkMag-maxInds"
## [526] "fBodyBodyAccJerkMag-meanFreq()"
## [527] "fBodyBodyAccJerkMag-skewness()"
## [528] "fBodyBodyAccJerkMag-kurtosis()"
## [529] "fBodyBodyGyroMag-mean()"
## [530] "fBodyBodyGyroMag-std()"
## [531] "fBodyBodyGyroMag-mad()"
## [532] "fBodyBodyGyroMag-max()"
## [533] "fBodyBodyGyroMag-min()"
## [534] "fBodyBodyGyroMag-sma()"
## [535] "fBodyBodyGyroMag-energy()"
## [536] "fBodyBodyGyroMag-iqr()"
## [537] "fBodyBodyGyroMag-entropy()"
## [538] "fBodyBodyGyroMag-maxInds"
## [539] "fBodyBodyGyroMag-meanFreq()"
## [540] "fBodyBodyGyroMag-skewness()"
## [541] "fBodyBodyGyroMag-kurtosis()"
## [542] "fBodyBodyGyroJerkMag-mean()"
## [543] "fBodyBodyGyroJerkMag-std()"
## [544] "fBodyBodyGyroJerkMag-mad()"
## [545] "fBodyBodyGyroJerkMag-max()"
## [546] "fBodyBodyGyroJerkMag-min()"
## [547] "fBodyBodyGyroJerkMag-sma()"
## [548] "fBodyBodyGyroJerkMag-energy()"
## [549] "fBodyBodyGyroJerkMag-iqr()"
## [550] "fBodyBodyGyroJerkMag-entropy()"
## [551] "fBodyBodyGyroJerkMag-maxInds"
## [552] "fBodyBodyGyroJerkMag-meanFreq()"
## [553] "fBodyBodyGyroJerkMag-skewness()"
## [554] "fBodyBodyGyroJerkMag-kurtosis()"
## [555] "angle(tBodyAccMean,gravity)"
## [556] "angle(tBodyAccJerkMean),gravityMean)"
## [557] "angle(tBodyGyroMean,gravityMean)"
## [558] "angle(tBodyGyroJerkMean,gravityMean)"
## [559] "angle(X,gravityMean)"
## [560] "angle(Y,gravityMean)"
## [561] "angle(Z,gravityMean)"
## [562] "subject"
## [563] "activity"
## Convert the outcome column (last column, `activity`) to a factor.
samsungData[, dimdata[2]] <- factor(samsungData[, dimdata[2]])
# check if is factor
is.factor(samsungData[, dimdata[2]])
## [1] TRUE
## Locate duplicated feature names (the fBody* bandsEnergy() names repeat
## three times each in the raw feature list).
dup_mask <- duplicated(colnames(samsungData))
duplicated_index <- which(dup_mask)
duplicated_columns <- unique(colnames(samsungData)[dup_mask])
## A copy that simply drops the repeated columns, kept for reference.
samsungData.new <- samsungData[, !dup_mask]
dim(samsungData.new)
## [1] 7352 479
## adding .index to each duplicated column: every occurrence of a repeated
## name (including the first) gets a ".1", ".2", ... suffix so that all
## 563 column names become unique.
for (nm in duplicated_columns) {
  hits <- which(colnames(samsungData) == nm)
  colnames(samsungData)[hits] <- paste(nm, seq_along(hits), sep = ".")
}
# view duplicated columns names
colnames(samsungData)[duplicated_index]
## [1] "fBodyAcc-bandsEnergy()-1,8.2"
## [2] "fBodyAcc-bandsEnergy()-9,16.2"
## [3] "fBodyAcc-bandsEnergy()-17,24.2"
## [4] "fBodyAcc-bandsEnergy()-25,32.2"
## [5] "fBodyAcc-bandsEnergy()-33,40.2"
## [6] "fBodyAcc-bandsEnergy()-41,48.2"
## [7] "fBodyAcc-bandsEnergy()-49,56.2"
## [8] "fBodyAcc-bandsEnergy()-57,64.2"
## [9] "fBodyAcc-bandsEnergy()-1,16.2"
## [10] "fBodyAcc-bandsEnergy()-17,32.2"
## [11] "fBodyAcc-bandsEnergy()-33,48.2"
## [12] "fBodyAcc-bandsEnergy()-49,64.2"
## [13] "fBodyAcc-bandsEnergy()-1,24.2"
## [14] "fBodyAcc-bandsEnergy()-25,48.2"
## [15] "fBodyAcc-bandsEnergy()-1,8.3"
## [16] "fBodyAcc-bandsEnergy()-9,16.3"
## [17] "fBodyAcc-bandsEnergy()-17,24.3"
## [18] "fBodyAcc-bandsEnergy()-25,32.3"
## [19] "fBodyAcc-bandsEnergy()-33,40.3"
## [20] "fBodyAcc-bandsEnergy()-41,48.3"
## [21] "fBodyAcc-bandsEnergy()-49,56.3"
## [22] "fBodyAcc-bandsEnergy()-57,64.3"
## [23] "fBodyAcc-bandsEnergy()-1,16.3"
## [24] "fBodyAcc-bandsEnergy()-17,32.3"
## [25] "fBodyAcc-bandsEnergy()-33,48.3"
## [26] "fBodyAcc-bandsEnergy()-49,64.3"
## [27] "fBodyAcc-bandsEnergy()-1,24.3"
## [28] "fBodyAcc-bandsEnergy()-25,48.3"
## [29] "fBodyAccJerk-bandsEnergy()-1,8.2"
## [30] "fBodyAccJerk-bandsEnergy()-9,16.2"
## [31] "fBodyAccJerk-bandsEnergy()-17,24.2"
## [32] "fBodyAccJerk-bandsEnergy()-25,32.2"
## [33] "fBodyAccJerk-bandsEnergy()-33,40.2"
## [34] "fBodyAccJerk-bandsEnergy()-41,48.2"
## [35] "fBodyAccJerk-bandsEnergy()-49,56.2"
## [36] "fBodyAccJerk-bandsEnergy()-57,64.2"
## [37] "fBodyAccJerk-bandsEnergy()-1,16.2"
## [38] "fBodyAccJerk-bandsEnergy()-17,32.2"
## [39] "fBodyAccJerk-bandsEnergy()-33,48.2"
## [40] "fBodyAccJerk-bandsEnergy()-49,64.2"
## [41] "fBodyAccJerk-bandsEnergy()-1,24.2"
## [42] "fBodyAccJerk-bandsEnergy()-25,48.2"
## [43] "fBodyAccJerk-bandsEnergy()-1,8.3"
## [44] "fBodyAccJerk-bandsEnergy()-9,16.3"
## [45] "fBodyAccJerk-bandsEnergy()-17,24.3"
## [46] "fBodyAccJerk-bandsEnergy()-25,32.3"
## [47] "fBodyAccJerk-bandsEnergy()-33,40.3"
## [48] "fBodyAccJerk-bandsEnergy()-41,48.3"
## [49] "fBodyAccJerk-bandsEnergy()-49,56.3"
## [50] "fBodyAccJerk-bandsEnergy()-57,64.3"
## [51] "fBodyAccJerk-bandsEnergy()-1,16.3"
## [52] "fBodyAccJerk-bandsEnergy()-17,32.3"
## [53] "fBodyAccJerk-bandsEnergy()-33,48.3"
## [54] "fBodyAccJerk-bandsEnergy()-49,64.3"
## [55] "fBodyAccJerk-bandsEnergy()-1,24.3"
## [56] "fBodyAccJerk-bandsEnergy()-25,48.3"
## [57] "fBodyGyro-bandsEnergy()-1,8.2"
## [58] "fBodyGyro-bandsEnergy()-9,16.2"
## [59] "fBodyGyro-bandsEnergy()-17,24.2"
## [60] "fBodyGyro-bandsEnergy()-25,32.2"
## [61] "fBodyGyro-bandsEnergy()-33,40.2"
## [62] "fBodyGyro-bandsEnergy()-41,48.2"
## [63] "fBodyGyro-bandsEnergy()-49,56.2"
## [64] "fBodyGyro-bandsEnergy()-57,64.2"
## [65] "fBodyGyro-bandsEnergy()-1,16.2"
## [66] "fBodyGyro-bandsEnergy()-17,32.2"
## [67] "fBodyGyro-bandsEnergy()-33,48.2"
## [68] "fBodyGyro-bandsEnergy()-49,64.2"
## [69] "fBodyGyro-bandsEnergy()-1,24.2"
## [70] "fBodyGyro-bandsEnergy()-25,48.2"
## [71] "fBodyGyro-bandsEnergy()-1,8.3"
## [72] "fBodyGyro-bandsEnergy()-9,16.3"
## [73] "fBodyGyro-bandsEnergy()-17,24.3"
## [74] "fBodyGyro-bandsEnergy()-25,32.3"
## [75] "fBodyGyro-bandsEnergy()-33,40.3"
## [76] "fBodyGyro-bandsEnergy()-41,48.3"
## [77] "fBodyGyro-bandsEnergy()-49,56.3"
## [78] "fBodyGyro-bandsEnergy()-57,64.3"
## [79] "fBodyGyro-bandsEnergy()-1,16.3"
## [80] "fBodyGyro-bandsEnergy()-17,32.3"
## [81] "fBodyGyro-bandsEnergy()-33,48.3"
## [82] "fBodyGyro-bandsEnergy()-49,64.3"
## [83] "fBodyGyro-bandsEnergy()-1,24.3"
## [84] "fBodyGyro-bandsEnergy()-25,48.3"
# Persist the row positions of the duplicated columns for reference.
write(duplicated_index, "duplicated_column_indices.txt", sep="\n")
# Class balance: number of records per activity (last column).
table(samsungData[, dimdata[2]])
##
## laying sitting standing walk walkdown walkup
## 1407 1286 1374 1226 986 1073
# Number of records per subject (second-to-last column).
table(samsungData[, dimdata[2]-1])
##
## 1 3 5 6 7 8 11 14 15 16 17 19 21 22 23 25 26 27
## 347 341 302 325 308 281 316 323 328 366 368 360 408 321 372 409 392 376
## 28 29 30
## 382 344 383
# Same per-subject counts as a barplot (summary() of the factor gives the counts).
barplot(summary(factor(samsungData[, dimdata[2]-1])), cex.names =0.6)
title("number of data points for each subject")
## Cross-tabulate number of records per (subject, activity).
my_df = table(samsungData$subject, samsungData$activity)
par(cex.main = 1)
# Mosaic plot of the contingency table; title added separately so cex.main applies.
plot(my_df, main="")
title(main="Distribution of Number of Records of Each Activity For Each Subject", outer = FALSE)
# One color per activity. Define the palette once so the bars and the legend
# cannot drift apart. BUG FIX: the original palette c(3,4,6,7,3,1) reused
# color 3 for two different activities, making them indistinguishable.
activity_colors = c(3, 4, 6, 7, 2, 1)
barplot(t(as.matrix(my_df)), beside = TRUE, main = "number of datapoints for each activity per subject", col = activity_colors)
# BUG FIX: use fill= so the legend shows colored boxes matching the filled
# bars; the original col=+lty=+lwd= drew line samples, which do not
# correspond to anything in a barplot.
legend("topright", colnames(my_df), fill = activity_colors, cex = 0.5)
# Per-subject spread of activity counts.
boxplot.matrix(t(my_df), xlab="subject", ylab="count", main="Number of instances per activity for each subject")
# Annotate the most frequent activity for the first ('1') and last ('30')
# subject; subject 30 is the 21st (and last) box on the x axis, since only
# 21 subjects are present in this data.
lb = c(names(which.max(my_df['1',])),names(which.max(my_df['30',])))
text(c(1,21), c(max(my_df['1',]), max(my_df['30',])), lb, col="red", pos=c(4,3), cex=0.9)
## ggplot2 version of the per-subject count boxplot
library(reshape2)
library(ggplot2)
# Long format: one row per (subject, activity) cell of the table.
melted = melt(my_df)
colnames(melted)[1:2] = c("subject","activity")
# Most frequent activity (and its count) for subjects 1, 15 and 30,
# used to annotate the plot.
lab = c(names(which.max(my_df["1", ])),
        names(which.max(my_df["15", ])),
        names(which.max(my_df["30", ])))
where = c(max(my_df["1", ]),
          max(my_df["15", ]),
          max(my_df["30", ]))
ggplot(melted, aes(factor(subject), value)) +
  geom_boxplot() +
  labs(title = "Number of Instances Per Activity For Each Subject",
       x = "subject", y = "count") +
  geom_text(aes("1", where[1] + 2), label = lab[1], color = "blue") +
  geom_text(aes("30", where[3] + 2), label = lab[3], color = "blue") +
  geom_text(aes("15", where[2] + 2), label = lab[2], color = "blue")
# Normalize each subject's row to activity *ratios* (rows sum to 1).
my_df.2 = t(apply(my_df, 1, function(x) x/sum(x)))
melted.2 = melt(my_df.2)
colnames(melted.2)[1:2] = c("subject","activity")
# NOTE(review): lab/where are recomputed here identically to the earlier
# block (max-count labels for subjects 1, 15, 30) but are not used below.
lab = c(names(which.max(my_df["1",])),names(which.max(my_df["15",])),names(which.max(my_df["30",])))
where = c(max(my_df["1",]),max(my_df["15",]), max(my_df["30",]))
# Boxplot of activity ratios per subject.
ggplot(melted.2, aes(factor(subject), value))+geom_boxplot()+labs(x="subject", y="ratio")+scale_y_continuous()
# One-way ANOVA on the ratios: do subjects differ in activity proportions?
melted.2$subject = factor(melted.2$subject)
summary(aov(value ~ subject, data = melted.2))
## Df Sum Sq Mean Sq F value Pr(>F)
## subject 20 0.0000 0.000000 0 1
## Residuals 105 0.1057 0.001007
# Same test on the raw counts: effect of subject on counts.
my_df_aov = aov(value~factor(subject),data = melted)
summary(my_df_aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(subject) 20 4225 211.3 1.602 0.0657 .
## Residuals 105 13849 131.9
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# And effect of activity on counts (strongly significant below).
my_df_aov.2 = aov(value~factor(activity),data = melted)
summary(my_df_aov.2)
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(activity) 5 6632 1326.4 13.91 1.02e-10 ***
## Residuals 120 11443 95.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Boxplot the 561 feature columns in windows of 50 columns at a time.
last_feature_index = dimdata[2]-2  # last feature column: exclude subject & activity
for (i in seq(1, last_feature_index, 50)) {
  # Clamp the window end to the last feature column (min() replaces the
  # original if/else, which computed the same value).
  end = min(i + 50, last_feature_index)
  # BUG FIX: the panel title previously printed (i+50) even when the final
  # window was clamped, mislabeling the last plot; use the real end column.
  boxplot(samsungData[, i:end], main = paste("[", i, ",", end, "]"))
}
#### boxplot
boxplot.matrix(my_df)
par(mfrow=c(1,1))
## Per-feature location/dispersion summaries over the 561 feature columns.
var_per_feature = apply(samsungData[,1:561], 2, var)
barplot(var_per_feature, axisnames = F, cex.names = 0.8, cex.axis = 0.8, xlab = "feature", main = "Variances")
mean_per_feature = apply(samsungData[,1:561], 2, mean)
barplot(mean_per_feature, axisnames = F, cex.names = 0.8, cex.axis = 0.8, xlab = "feature", main = "Means")
# Coefficient of variation (sd/mean) per feature; it can be negative when a
# feature's mean is negative, hence the absolute version below.
cv_per_feature = apply(samsungData[,1:561], 2, function(col) sd(col)/mean(col))
barplot(cv_per_feature, axisnames = F, cex.names = 0.8, cex.axis = 0.8, xlab = "feature", main = "CVs")
abs_cv_per_feature = abs(cv_per_feature)
barplot(abs_cv_per_feature, axisnames = F, cex.names = 0.8, cex.axis = 0.8, xlab = "feature", main = "Absolute CVs")
# sort() on a named vector keeps the feature names attached to the values.
sorted_variances = sort(var_per_feature, decreasing = TRUE)
sorted_abs_cvs = sort(abs_cv_per_feature, decreasing = TRUE)
# pick top 5
top_5_sorted_variances = head(sorted_variances, 5)
top_5_sorted_abs_cvs = head(sorted_abs_cvs, 5)
names_top_5_sorted_variances = names(top_5_sorted_variances)
names_top_5_sorted_abs_cvs = names(top_5_sorted_abs_cvs)
top_5_sorted_variances
## fBodyAccJerk-entropy()-X fBodyAccJerk-entropy()-Y
## 0.5651023 0.5425037
## tBodyAccJerkMag-entropy() fBodyAcc-entropy()-X
## 0.5289138 0.5257172
## tGravityAcc-correlation()-X,Z
## 0.5008733
top_5_sorted_abs_cvs
## tBodyGyro-correlation()-X,Z tBodyGyroJerk-entropy()-Y
## 390.41424 362.43103
## angle(tBodyAccJerkMean),gravityMean) tGravityAcc-max()-Y
## 205.07158 109.82800
## angle(tBodyGyroJerkMean,gravityMean)
## 79.91638
## Split `data` into train/test by SUBJECT (80/20), so no subject appears in
## both sets, and write the full/train/test CSVs to the working directory.
##
## Args:
##   data: data frame whose second-to-last column is `subject` and last is
##         the outcome (also addressable as data[,"subject"]).
##   seed: RNG seed so the subject sample is reproducible.
## Side effects (kept for downstream code):
##   - assigns `trainning_indices` and `test_indices` in the global env (<<-)
##   - writes three CSV files
split_data = function (data, seed) {
  set.seed(seed) # set seed for reproducibility
  # The distinct subject ids, taken from the second-to-last column.
  subject_list = levels(factor(data[, dim(data)[2] - 1]))
  training_ratio = 0.8
  n_train = floor(training_ratio * length(subject_list))
  # Sample whole subjects into the training set (no overlap with test).
  trainning_subjects = sample(subject_list, n_train, replace = FALSE)
  trainning_indices <<- which(data[, "subject"] %in% trainning_subjects)
  # BUG FIX: the original computed subject_list[!(subject_list %in%
  # trainning_indices)], comparing subject ids against *row indices*, which
  # made test_subjects wrong; the test set is the complementary subjects.
  test_subjects = setdiff(subject_list, trainning_subjects)
  test_indices <<- which(!data[, "subject"] %in% trainning_subjects)
  write.table(data, file = "samsungData_fixed-duplicated-columns.csv", row.names = FALSE, col.names = TRUE, sep = "," )
  write.table(data[trainning_indices, ], file = "samsungData_fixed-duplicated-columns.train.csv", row.names = FALSE, col.names = TRUE, sep = "," )
  write.table(data[test_indices, ], file = "samsungData_fixed-duplicated-columns.test.csv", row.names = FALSE, col.names = TRUE, sep = "," )
}
split_data(data = samsungData, seed = 123)
# The following two commands remove any previously installed H2O packages for R.
if ("package:h2o" %in% search()) { detach("package:h2o", unload = TRUE) }
if ("h2o" %in% rownames(installed.packages())) { remove.packages("h2o") }
# Next, install any of the packages H2O depends on that are missing.
h2o_deps = c("methods", "statmod", "stats", "graphics",
             "RCurl", "jsonlite", "tools", "utils")
for (pkg in h2o_deps) {
  if (!(pkg %in% rownames(installed.packages()))) {
    install.packages(pkg)
  }
}
# h2o_3.8.2.6
install.packages("h2o", type="source", repos=(c("https://h2o-release.s3.amazonaws.com/h2o/rel-turchin/6/R")))
## load modules and start h2o compute node
library(h2o)
## Loading required package: statmod
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## sd, var
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
localH2O = h2o.init(ip = "localhost", startH2O = TRUE)
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 16 hours 41 minutes
## H2O cluster version: 3.8.2.6
## H2O cluster name: H2O_started_from_R_tkhunkhe_gbi986
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.21 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 2
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## R Version: R version 3.3.0 (2016-05-03)
# upload file to h2o
# Push the (deduplicated-column) training CSV into the H2O cluster as an H2OFrame.
samsungData.hex = h2o.uploadFile(path = "samsungData_fixed-duplicated-columns.train.csv")
dim(samsungData.hex)
# NOTE(review): 'ncol' shadows base::ncol() for the rest of the script —
# confirm no later call intends the base function.
ncol = dim(samsungData.hex)[2]
# Predictors: every column except the last two; response: the last column.
# (Presumably the second-to-last column is the subject id — TODO confirm.)
x = colnames(samsungData.hex)[-((ncol-1):ncol)]
y = colnames(samsungData.hex)[ncol]
# classification with random forest, and get the top most important features used
# model_id.txt caches ids of previously trained models so reruns can fetch them
# from the running cluster instead of retraining.
list_models = read.table("model_id.txt", sep="\t", header = FALSE,col.names = c('model', 'model_id'), stringsAsFactors = FALSE)
rownames(list_models) = list_models[,1]
list_models = list_models[-1]
# Try to reuse the cached full model; on any error (missing file/id), train a
# fresh 10-fold-CV random forest and record its id in the cache file.
tryCatch ( {
model <<- h2o.getModel(list_models['full model',])}, # <<- save to global
error=function(e) {
model <<- h2o.randomForest(x, y, seed = 123, samsungData.hex, nfolds = 10)
write(paste("full model",model@model_id, sep="\t"), "model_id.txt", append = FALSE)} )
# The same forest without cross-validation, for runtime/varimp comparison below.
model_no_cross_validate = h2o.randomForest(x, y, seed = 123, samsungData.hex)
##
|
| | 0%
|
|=== | 4%
|
|====== | 10%
|
|========= | 14%
|
|============= | 20%
|
|================ | 24%
|
|==================== | 30%
|
|======================= | 36%
|
|========================== | 40%
|
|============================== | 46%
|
|================================ | 50%
|
|==================================== | 56%
|
|======================================= | 60%
|
|=========================================== | 66%
|
|=============================================== | 72%
|
|================================================= | 76%
|
|===================================================== | 82%
|
|======================================================== | 86%
|
|============================================================ | 92%
|
|============================================================== | 96%
|
|=================================================================| 100%
# Top-5 most important variables agree between the CV and no-CV forests.
vars_from_cv = h2o.varimp(model)[1:5,'variable']
vars_from_no_cv = h2o.varimp(model_no_cross_validate)[1:5,'variable']
identical(vars_from_cv, vars_from_no_cv) # true
## [1] TRUE
# Training runtimes (milliseconds) with and without 10-fold CV.
model_no_cross_validate @model$run_time # 18 sec
## [1] 20931
model@model$run_time # 25 sec
## [1] 30076
h2o.confusionMatrix(model)
## Confusion Matrix: vertical: actual; across: predicted
## laying sitting standing walk walkdown walkup Error Rate
## laying 1092 0 0 0 0 0 0.0000 = 0 / 1,092
## sitting 0 955 59 0 0 0 0.0582 = 59 / 1,014
## standing 0 21 1063 0 0 0 0.0194 = 21 / 1,084
## walk 0 1 2 942 5 5 0.0136 = 13 / 955
## walkdown 0 0 0 4 753 7 0.0144 = 11 / 764
## walkup 0 0 0 1 1 824 0.0024 = 2 / 826
## Totals 1092 977 1124 947 759 836 0.0185 = 106 / 5,735
# Overall training accuracy = 1 - total OOB error.
1 - h2o.confusionMatrix(model)['Totals','Error']
## [1] 0.981517
# Keep the full model's CV accuracy (mean, sd) as the baseline for later plots.
baseline_accuracy_cv = model@model$cross_validation_metrics_summary[,c('mean', 'sd')]
baseline_accuracy = baseline_accuracy_cv['accuracy',]
# Compare each CV fold's top-5 important variables with the full model's
# ("selected") top 5; the 1:10 column labels assume 10 CV folds.
compared_top_imp_vars =matrix(nrow=5, ncol=0)
for (m in h2o.cross_validation_models(model)){
compared_top_imp_vars = cbind(compared_top_imp_vars, h2o.varimp(m)$variable[1:5])
}
compared_top_imp_vars = cbind(compared_top_imp_vars,h2o.varimp(model)$variable[1:5])
colnames(compared_top_imp_vars) = c(1:10, "selected")
rownames(compared_top_imp_vars) = paste("var",c(1:5), sep="")
t(as.data.frame(compared_top_imp_vars))
## var1 var2
## 1 "tGravityAcc-mean()-Y" "tGravityAcc-min()-X"
## 2 "tGravityAcc-mean()-Y" "angle(X,gravityMean)"
## 3 "tGravityAcc-mean()-Y" "tGravityAcc-energy()-X"
## 4 "tGravityAcc-mean()-Y" "angle(X,gravityMean)"
## 5 "tGravityAcc-mean()-Y" "angle(X,gravityMean)"
## 6 "tGravityAcc-mean()-Y" "angle(X,gravityMean)"
## 7 "tGravityAcc-mean()-Y" "angle(X,gravityMean)"
## 8 "tGravityAcc-mean()-Y" "angle(X,gravityMean)"
## 9 "tGravityAcc-mean()-Y" "angle(X,gravityMean)"
## 10 "tGravityAcc-mean()-Y" "tGravityAcc-min()-X"
## selected "tGravityAcc-mean()-Y" "tGravityAcc-min()-X"
## var3 var4
## 1 "angle(X,gravityMean)" "tGravityAcc-energy()-X"
## 2 "tGravityAcc-min()-X" "tGravityAcc-energy()-X"
## 3 "angle(X,gravityMean)" "tGravityAcc-min()-X"
## 4 "tGravityAcc-min()-X" "tGravityAcc-energy()-X"
## 5 "tGravityAcc-min()-X" "tGravityAcc-energy()-X"
## 6 "tGravityAcc-min()-X" "tGravityAcc-energy()-X"
## 7 "tGravityAcc-min()-X" "tGravityAcc-energy()-X"
## 8 "tGravityAcc-min()-X" "tGravityAcc-energy()-X"
## 9 "tGravityAcc-min()-X" "tGravityAcc-energy()-X"
## 10 "angle(X,gravityMean)" "tGravityAcc-energy()-X"
## selected "angle(X,gravityMean)" "tGravityAcc-energy()-X"
## var5
## 1 "tGravityAcc-min()-Z"
## 2 "tGravityAcc-max()-Y"
## 3 "tGravityAcc-min()-Z"
## 4 "angle(Y,gravityMean)"
## 5 "tGravityAcc-max()-Y"
## 6 "tGravityAcc-min()-Z"
## 7 "angle(Y,gravityMean)"
## 8 "tGravityAcc-max()-Y"
## 9 "tGravityAcc-max()-Y"
## 10 "angle(Y,gravityMean)"
## selected "tGravityAcc-max()-Y"
# select top features
# Prefixes of the full model's importance-ranked variable list (top 1..5).
top5_important_feature = model@model$variable_importances$variable[1:5]
top4_important_feature = model@model$variable_importances$variable[1:4]
top3_important_feature = model@model$variable_importances$variable[1:3]
top2_important_feature = model@model$variable_importances$variable[1:2]
top1_important_feature = model@model$variable_importances$variable[1]
### Train models using the top 1-5 most important features, until 80% accuracy is achieved
# As before: fetch cached models by id if possible; otherwise train all five
# reduced 10-fold-CV forests and append their ids to the cache file.
tryCatch( {
model.with.1.features<<- h2o.getModel(list_models['1-featured model',])
model.with.2.features<<- h2o.getModel(list_models['2-featured model',])
model.with.3.features<<- h2o.getModel(list_models['3-featured model',])
model.with.4.features<<- h2o.getModel(list_models['4-featured model',])
model.with.5.features<<- h2o.getModel(list_models['5-featured model',])
}, error = function(e) {
model.with.1.features <<- h2o.randomForest(top1_important_feature, y, seed = 123, samsungData.hex, nfolds = 10)
model.with.2.features <<- h2o.randomForest(top2_important_feature, y, seed = 123, samsungData.hex, nfolds = 10)
model.with.3.features <<- h2o.randomForest(top3_important_feature, y, seed = 123, samsungData.hex, nfolds = 10)
model.with.4.features <<- h2o.randomForest(top4_important_feature, y, seed = 123, samsungData.hex, nfolds = 10)
model.with.5.features <<- h2o.randomForest(top5_important_feature, y, seed = 123, samsungData.hex, nfolds = 10)
write(paste("1-featured model",model.with.1.features@model_id, sep="\t"), "model_id.txt", append = TRUE)
write(paste("2-featured model",model.with.2.features@model_id, sep="\t"), "model_id.txt", append = TRUE)
write(paste("3-featured model",model.with.3.features@model_id, sep="\t"), "model_id.txt", append = TRUE)
write(paste("4-featured model",model.with.4.features@model_id, sep="\t"), "model_id.txt", append = TRUE)
write(paste("5-featured model",model.with.5.features@model_id, sep="\t"), "model_id.txt", append = TRUE)
})
# NOTE(review): the block below repeats the fold-vs-full variable-importance
# comparison verbatim from earlier in the script — likely a copy/paste duplicate.
compared_top_imp_vars =matrix(nrow=5, ncol=0)
for (m in h2o.cross_validation_models(model)){
compared_top_imp_vars = cbind(compared_top_imp_vars, h2o.varimp(m)$variable[1:5])
}
compared_top_imp_vars = cbind(compared_top_imp_vars,h2o.varimp(model)$variable[1:5])
colnames(compared_top_imp_vars) = c(1:10, "selected")
rownames(compared_top_imp_vars) = paste("var",c(1:5), sep="")
t(as.data.frame(compared_top_imp_vars))
## var1 var2
## 1 "tGravityAcc-mean()-Y" "tGravityAcc-min()-X"
## 2 "tGravityAcc-mean()-Y" "angle(X,gravityMean)"
## 3 "tGravityAcc-mean()-Y" "tGravityAcc-energy()-X"
## 4 "tGravityAcc-mean()-Y" "angle(X,gravityMean)"
## 5 "tGravityAcc-mean()-Y" "angle(X,gravityMean)"
## 6 "tGravityAcc-mean()-Y" "angle(X,gravityMean)"
## 7 "tGravityAcc-mean()-Y" "angle(X,gravityMean)"
## 8 "tGravityAcc-mean()-Y" "angle(X,gravityMean)"
## 9 "tGravityAcc-mean()-Y" "angle(X,gravityMean)"
## 10 "tGravityAcc-mean()-Y" "tGravityAcc-min()-X"
## selected "tGravityAcc-mean()-Y" "tGravityAcc-min()-X"
## var3 var4
## 1 "angle(X,gravityMean)" "tGravityAcc-energy()-X"
## 2 "tGravityAcc-min()-X" "tGravityAcc-energy()-X"
## 3 "angle(X,gravityMean)" "tGravityAcc-min()-X"
## 4 "tGravityAcc-min()-X" "tGravityAcc-energy()-X"
## 5 "tGravityAcc-min()-X" "tGravityAcc-energy()-X"
## 6 "tGravityAcc-min()-X" "tGravityAcc-energy()-X"
## 7 "tGravityAcc-min()-X" "tGravityAcc-energy()-X"
## 8 "tGravityAcc-min()-X" "tGravityAcc-energy()-X"
## 9 "tGravityAcc-min()-X" "tGravityAcc-energy()-X"
## 10 "angle(X,gravityMean)" "tGravityAcc-energy()-X"
## selected "angle(X,gravityMean)" "tGravityAcc-energy()-X"
## var5
## 1 "tGravityAcc-min()-Z"
## 2 "tGravityAcc-max()-Y"
## 3 "tGravityAcc-min()-Z"
## 4 "angle(Y,gravityMean)"
## 5 "tGravityAcc-max()-Y"
## 6 "tGravityAcc-min()-Z"
## 7 "angle(Y,gravityMean)"
## 8 "tGravityAcc-max()-Y"
## 9 "tGravityAcc-max()-Y"
## 10 "angle(Y,gravityMean)"
## selected "tGravityAcc-max()-Y"
# select top features
# NOTE(review): names_top_5_sorted_variances / names_top_5_sorted_abs_cvs are
# defined in an earlier chunk (features ranked by variance and by absolute
# coefficient of variation) — confirm they are in scope when this runs.
top5_important_feature_filter_variance = names_top_5_sorted_variances
top5_important_feature_filter_abs_cv = names_top_5_sorted_abs_cvs
top4_important_feature_filter_variance = names_top_5_sorted_variances[1:4]
top4_important_feature_filter_abs_cv = names_top_5_sorted_abs_cvs[1:4]
top3_important_feature_filter_variance = names_top_5_sorted_variances[1:3]
top3_important_feature_filter_abs_cv = names_top_5_sorted_abs_cvs[1:3]
y = colnames(samsungData.hex)[ncol]
# Cached-model lookup first; otherwise train 3/4/5-feature forests for both
# filter criteria and append their ids to the cache file.
tryCatch( {
# model.with.1.features<<- h2o.getModel(list_models['1-featured model',])
# model.with.2.features<<- h2o.getModel(list_models['2-featured model',])
# model.with.3.features<<- h2o.getModel(list_models['3-featured model',])
# model.with.4.features<<- h2o.getModel(list_models['4-featured model',])
model.with.5.features_filter_variance <<- h2o.getModel(list_models['5-featured model_filter_variance',])
model.with.5.features_filter_abs_cv <<- h2o.getModel(list_models['5-featured model_filter_cv',])
model.with.4.features_filter_variance <<- h2o.getModel(list_models['4-featured model_filter_variance',])
model.with.4.features_filter_abs_cv <<- h2o.getModel(list_models['4-featured model_filter_cv',])
model.with.3.features_filter_variance <<- h2o.getModel(list_models['3-featured model_filter_variance',])
model.with.3.features_filter_abs_cv <<- h2o.getModel(list_models['3-featured model_filter_cv',])
}, error = function(e) {
model.with.5.features_filter_variance <<- h2o.randomForest(top5_important_feature_filter_variance, y, seed = 123, samsungData.hex, nfolds = 10)
model.with.5.features_filter_abs_cv <<- h2o.randomForest(top5_important_feature_filter_abs_cv, y, seed = 123, samsungData.hex, nfolds = 10)
model.with.4.features_filter_variance <<- h2o.randomForest(top4_important_feature_filter_variance, y, seed = 123, samsungData.hex, nfolds = 10)
model.with.4.features_filter_abs_cv <<- h2o.randomForest(top4_important_feature_filter_abs_cv, y, seed = 123, samsungData.hex, nfolds = 10)
model.with.3.features_filter_variance <<- h2o.randomForest(top3_important_feature_filter_variance, y, seed = 123, samsungData.hex, nfolds = 10)
model.with.3.features_filter_abs_cv <<- h2o.randomForest(top3_important_feature_filter_abs_cv, y, seed = 123, samsungData.hex, nfolds = 10)
write(paste("5-featured model_filter_variance",model.with.5.features_filter_variance@model_id, sep="\t"), "model_id.txt", append = TRUE)
write(paste("5-featured model_filter_cv",model.with.5.features_filter_abs_cv@model_id, sep="\t"), "model_id.txt", append = TRUE)
write(paste("4-featured model_filter_variance",model.with.4.features_filter_variance@model_id, sep="\t"), "model_id.txt", append = TRUE)
write(paste("4-featured model_filter_cv",model.with.4.features_filter_abs_cv@model_id, sep="\t"), "model_id.txt", append = TRUE)
write(paste("3-featured model_filter_variance",model.with.3.features_filter_variance@model_id, sep="\t"), "model_id.txt", append = TRUE)
write(paste("3-featured model_filter_cv",model.with.3.features_filter_abs_cv@model_id, sep="\t"), "model_id.txt", append = TRUE)
})
# Collect mean/sd CV accuracy for the 1..5-feature models plus the full model.
accuracy.randForest = data.frame(0,0)
colnames(accuracy.randForest) = c('mean','sd')
model_list = list( model.with.1.features,model.with.2.features,model.with.3.features,model.with.4.features,model.with.5.features)
for (i in 1:length(model_list)){
accuracy.randForest[i,] = as.numeric(model_list[[i]]@model$cross_validation_metrics_summary['accuracy',c('mean','sd')])
}
accuracy.randForest['all',] = as.numeric( baseline_accuracy)
accuracy.randForest
## mean sd
## 1 0.4983688 0.012374596
## 2 0.7921445 0.013212611
## 3 0.8808801 0.008699819
## 4 0.8851942 0.006473109
## 5 0.8941538 0.009877398
## all 0.9824840 0.003290585
# ggplot2
# NOTE(review): x/y are reused as plot vectors here, clobbering the earlier
# predictor/response column names; qplot() assumes ggplot2 was loaded earlier.
x = rownames(accuracy.randForest)
y = accuracy.randForest[,1]
sd = accuracy.randForest[,2]
h = 0.8
# Error-bar plot of accuracy vs feature count, with the 0.8 threshold marked.
qplot(x,y)+geom_errorbar(aes(x=x, ymin=y-sd, ymax=y+sd), width=0.25)+labs(title = "Accuracies of Models with Different Numbers of Features Used", x="number of features used", y="accuracy")+geom_hline(yintercept = h,lty = "dashed",show.legend = TRUE )+geom_text(aes("all",h,label = "threshold = 0.8", vjust = -1))
# Same accuracy summary and plot for the variance-filtered 3/4/5-feature models.
accuracy = data.frame(0,0)
colnames(accuracy) = c('mean','sd')
model_list_var = list( model.with.3.features_filter_variance,model.with.4.features_filter_variance,model.with.5.features_filter_variance)
for (i in 1:length(model_list_var)){
accuracy[i,] = as.numeric(model_list_var[[i]]@model$cross_validation_metrics_summary['accuracy',c('mean','sd')])
}
accuracy['all',] = as.numeric( baseline_accuracy)
rownames(accuracy)[1:3] = 3:5
accuracy
## mean sd
## 3 0.4952847 0.009646328
## 4 0.5765822 0.016515600
## 5 0.6213171 0.008199125
## all 0.9824840 0.003290585
# ggplot2
x = rownames(accuracy)
y = accuracy[,1]
sd = accuracy[,2]
h = 0.8
qplot(x,y)+geom_errorbar(aes(x=x, ymin=y-sd, ymax=y+sd), width=0.25)+labs(title = "Accuracies of Models with Different Numbers of Most-variant Features Used", x="number of features used", y="accuracy")+geom_hline(yintercept = h,lty = "dashed",show.legend = TRUE )+geom_text(aes("all",h,label = "threshold = 0.8", vjust = -1))
# And for the models built on features with the highest absolute coefficient
# of variation.
accuracy = data.frame(0,0)
colnames(accuracy) = c('mean','sd')
model_list_abs_cv = list( model.with.3.features_filter_abs_cv,model.with.4.features_filter_abs_cv,model.with.5.features_filter_abs_cv)
for (i in 1:length(model_list_abs_cv)){
accuracy[i,] = as.numeric(model_list_abs_cv[[i]]@model$cross_validation_metrics_summary['accuracy',c('mean','sd')])
}
accuracy['all',] = as.numeric( baseline_accuracy)
rownames(accuracy)[1:3] = 3:5
accuracy
## mean sd
## 3 0.4630966 0.010582850
## 4 0.7972229 0.009651011
## 5 0.7953448 0.009243531
## all 0.9824840 0.003290585
# ggplot2
x = rownames(accuracy)
y = accuracy[,1]
sd = accuracy[,2]
h = 0.8
qplot(x,y)+geom_errorbar(aes(x=x, ymin=y-sd, ymax=y+sd), width=0.25)+labs(title = "Accuracies of Models with Different Numbers of Highest-Abs-CV Features Used", x="number of features used", y="accuracy")+geom_hline(yintercept = h,lty = "dashed",show.legend = TRUE )+geom_text(aes("all",h,label = "threshold = 0.8", vjust = -1))
# Pick the smallest model (fewest features) whose mean CV accuracy reaches the
# 0.8 threshold; rows of accuracy.randForest are ordered by feature count 1..5,
# then 'all'.
acc = accuracy.randForest[,"mean" ]
# First index meeting the threshold. Unlike the original break-loop (which
# left num_features_selected undefined when nothing qualified, causing an
# "object not found" error downstream), which()[1] yields NA in that case,
# which we turn into an explicit failure.
num_features_selected = which(acc >= 0.8)[1]
if (is.na(num_features_selected)) stop("no model reached the 0.8 accuracy threshold")
num_features_selected
## [1] 3
selected_model = model_list[[num_features_selected]]
selected_features = top5_important_feature[1:num_features_selected]
h2o.confusionMatrix(selected_model)
## Confusion Matrix: vertical: actual; across: predicted
## laying sitting standing walk walkdown walkup Error Rate
## laying 1092 0 0 0 0 0 0.0000 = 0 / 1,092
## sitting 0 961 22 10 14 7 0.0523 = 53 / 1,014
## standing 0 29 946 51 30 28 0.1273 = 138 / 1,084
## walk 0 14 58 808 50 25 0.1539 = 147 / 955
## walkdown 0 12 21 77 554 100 0.2749 = 210 / 764
## walkup 0 8 8 26 69 715 0.1344 = 111 / 826
## Totals 1092 1024 1055 972 717 875 0.1149 = 659 / 5,735
# Cross-validated metrics (mean, sd over the 10 folds) of the selected model.
selected_model@model$cross_validation_metrics_summary[,c('mean', 'sd')]
## mean sd
## accuracy 0.8808801 0.008699819
## err 0.11911989 0.008699819
## err_count 68.2 4.886717
## logloss 0.40504476 0.039050993
## max_per_class_error 0.28640023 0.04486839
## mse 0.107486196 0.003819107
## r2 0.96210855 0.0016113676
# Forest structure summary (tree count, depth, leaves, byte size).
selected_model@model$model_summary
## Model Summary:
## number_of_trees model_size_in_bytes min_depth max_depth mean_depth
## 1 300 1070818 1 20 16.84667
## min_leaves max_leaves mean_leaves
## 1 2 515 303.62000
# Print the full model details (training + cross-validation metrics).
selected_model
## Model Details:
## ==============
##
## H2OMultinomialModel: drf
## Model ID: DRF_model_R_1475468380215_5
## Model Summary:
## number_of_trees model_size_in_bytes min_depth max_depth mean_depth
## 1 300 1070818 1 20 16.84667
## min_leaves max_leaves mean_leaves
## 1 2 515 303.62000
##
##
## H2OMultinomialMetrics: drf
## ** Reported on training data. **
## Description: Metrics reported on Out-Of-Bag training samples
##
## Training Set Metrics:
## =====================
## Metrics reported on Out-Of-Bag training samples
##
## Extract training frame with `h2o.getFrame("samsungData_fixed-duplicated-columns_sid_b07f_2")`
## MSE: (Extract with `h2o.mse`) 0.1072876
## R^2: (Extract with `h2o.r2`) 0.9622172
## Logloss: (Extract with `h2o.logloss`) 0.598621
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,train = TRUE)`)
## =========================================================================
## Confusion Matrix: vertical: actual; across: predicted
## laying sitting standing walk walkdown walkup Error Rate
## laying 1092 0 0 0 0 0 0.0000 = 0 / 1,092
## sitting 0 961 22 10 14 7 0.0523 = 53 / 1,014
## standing 0 29 946 51 30 28 0.1273 = 138 / 1,084
## walk 0 14 58 808 50 25 0.1539 = 147 / 955
## walkdown 0 12 21 77 554 100 0.2749 = 210 / 764
## walkup 0 8 8 26 69 715 0.1344 = 111 / 826
## Totals 1092 1024 1055 972 717 875 0.1149 = 659 / 5,735
##
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,train = TRUE)`
## =======================================================================
## Top-6 Hit Ratios:
## k hit_ratio
## 1 1 0.885092
## 2 2 0.963208
## 3 3 0.983261
## 4 4 0.989364
## 5 5 0.990235
## 6 6 1.000000
##
##
##
## H2OMultinomialMetrics: drf
## ** Reported on cross-validation data. **
## Description: 10-fold cross-validation on training data (Metrics computed for combined holdout predictions)
##
## Cross-Validation Set Metrics:
## =====================
## 10-fold cross-validation on training data (Metrics computed for combined holdout predictions)
##
## Extract cross-validation frame with `h2o.getFrame("samsungData_fixed-duplicated-columns_sid_b07f_2")`
## MSE: (Extract with `h2o.mse`) 0.1074331
## R^2: (Extract with `h2o.r2`) 0.962166
## Logloss: (Extract with `h2o.logloss`) 0.40561
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,xval = TRUE)`
## =======================================================================
## Top-6 Hit Ratios:
## k hit_ratio
## 1 1 0.881081
## 2 2 0.966173
## 3 3 0.989364
## 4 4 0.996338
## 5 5 0.997733
## 6 6 1.000000
##
##
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid
## accuracy 0.8808801 0.008699819 0.8962433 0.8752026
## err 0.11911989 0.008699819 0.10375671 0.124797404
## err_count 68.2 4.886717 58 77
## logloss 0.40504476 0.039050993 0.43640986 0.500366
## max_per_class_error 0.28640023 0.04486839 0.28169015 0.3783784
## mse 0.107486196 0.003819107 0.09968863 0.110442854
## r2 0.96210855 0.0016113676 0.96564025 0.9613993
## cv_3_valid cv_4_valid cv_5_valid cv_6_valid
## accuracy 0.88264465 0.90189326 0.8635514 0.8881789
## err 0.11735537 0.09810671 0.13644859 0.111821085
## err_count 71 57 73 70
## logloss 0.4445967 0.3741133 0.48623514 0.35710683
## max_per_class_error 0.2027027 0.18666667 0.32894737 0.23376623
## mse 0.10546697 0.09904204 0.11496392 0.106322244
## r2 0.9623406 0.96377623 0.9603755 0.9606866
## cv_7_valid cv_8_valid cv_9_valid cv_10_valid
## accuracy 0.88928574 0.87523276 0.8645833 0.87198514
## err 0.11071429 0.12476723 0.13541667 0.12801485
## err_count 62 67 78 69
## logloss 0.39912796 0.34781325 0.36107048 0.34360805
## max_per_class_error 0.2875 0.37704918 0.26262626 0.32467532
## mse 0.10474819 0.10700288 0.115492225 0.111692
## r2 0.9651872 0.96346056 0.9581606 0.9600589
# y = colnames(samsungData.hex)[ncol]
# is.factor(samsungData.hex[ncol])
# #install.packages("ade4")
# library(ade4)
# y.array = acm.disjonctif(samsungData[ncol])
# colnames(y.array) = names(summary(factor(samsungData[,ncol])))
# samsungData.glm = samsungData
# samsungData.glm = samsungData.glm[,-563]
# samsungData.glm = cbind(samsungData.glm , y.array)
# write.table(samsungData.glm, file = "samsungData.glm.csv", row.names = FALSE, col.names = TRUE, sep = "," )
#
# write.table(samsungData.glm[trainning_indices,], file = "samsungData.glm.train.csv", row.names = FALSE, col.names = TRUE, sep = "," )
#
# write.table(samsungData.glm[test_indices,], file = "samsungData.glm.test.csv", row.names = FALSE, col.names = TRUE, sep = "," )
#
# samsungData.hex.glm = h2o.uploadFile("samsungData.glm.train.csv")
#
# ### how to do glm -- > multi label !!
# glm.model.train = h2o.glm(x=top3_important_feature, y = colnames(y.array), training_frame = samsungData.hex.glm, family = "binomial")
#
# library(nnet)
# top3_important_feature
# ind = match(top3_important_feature, colnames(samsungData))
# indexed_samsumData = samsungData
# colnames(indexed_samsumData) = make.names(colnames(samsungData), unique = TRUE)
# top3_important_feature_glm = colnames(indexed_samsumData) [ind]
# train.glm = multinom(activity ~ tGravityAcc.mean...Y + tGravityAcc.min...X +angle.X.gravityMean.,data=indexed_samsumData[trainning_indices,])
#
# z <- summary(train.glm)$coefficients/summary(train.glm)$standard.errors
# z
# #2-tailed z test
# p <- (1 - pnorm(abs(z), 0, 1))*2
# p # how does this work?
# exp(coef(train.glm))
# head(pp <- fitted(train.glm))
#
# d.samsungData.train = indexed_samsumData[trainning_indices,c(42,53,559,563)]
#
# train.predicted.glm = predict(train.glm, newdata = d.samsungData.train )
#
# ## confusion matrix
# table(train.predicted.glm, indexed_samsumData[trainning_indices,563])
#
# # accuracy
# sum(train.predicted.glm==indexed_samsumData[trainning_indices,563])/length(train.predicted.glm)
#
#
# ############## don't touch this
#
# d.samsungData.test = indexed_samsumData[test_indices,c(42,53,559,563)]
#
# test.predicted.glm = predict(train.glm, newdata = d.samsungData.test )
# Upload the held-out test CSV into the H2O cluster.
samsungData.test.hex = h2o.uploadFile(path = "samsungData_fixed-duplicated-columns.test.csv")
##
|
| | 0%
|
|=================================================================| 100%
dim(samsungData.test.hex)
## [1] 1617 563
# ncol = dim(samsungData.test.hex)[2]
# x =selected_features
# y = colnames(samsungData.test.hex)[ncol]
#
# # classification of the test data with random forest
# tryCatch( {
# model.test<<- h2o.getModel(list_models['test_model',])
# }, error = function(e) {
# model.test <<- h2o.randomForest(x, y, seed = 123, samsungData.test.hex)
# write(paste("test_model",model.test@model_id, sep="\t"), "model_id.txt", append = TRUE)
# })
# prediction time
# Time the reduced (3-feature) model's prediction over the test frame.
# NOTE(review): this uses model.with.3.features directly rather than
# selected_model — fine while num_features_selected is 3; confirm if the
# threshold selection changes.
ptm <- proc.time() # start timer
test.predicted = h2o.predict(model.with.3.features, newdata = samsungData.test.hex)
##
|
| | 0%
|
|=================================================================| 100%
t.reduced = proc.time() - ptm # elapsed is the 'real' time
t.reduced
## user system elapsed
## 0.056 0.005 1.107
#summary(test.predicted)
# accuracy
# Fraction of test rows where the predicted class matches the response column
# (relies on the 'ncol' variable computed from the training frame earlier).
test.accuracy = sum(test.predicted[,'predict']==samsungData.test.hex[,ncol])/dim(samsungData.test.hex[,ncol])[1]
test.accuracy
## [1] 0.6672851
# confusion matrix
table(as.matrix(test.predicted[,'predict']),as.matrix(samsungData.test.hex[,ncol]))# need as.marix because h2o's result is environment and can't be put in table
##
## laying sitting standing walk walkdown walkup
## laying 315 0 0 0 0 0
## sitting 0 229 51 21 23 0
## standing 0 39 175 84 4 33
## walk 0 1 48 109 20 43
## walkdown 0 3 12 50 99 19
## walkup 0 0 4 7 76 152
# Time the full (all-feature) baseline model's prediction on the same test frame.
ptm <- proc.time() # start timer
test.predicted.baseline = h2o.predict(model, newdata = samsungData.test.hex)
##
|
| | 0%
|
|=================================================================| 100%
t.baseline = proc.time() - ptm # elapsed is the 'real' time
t.baseline
## user system elapsed
## 0.093 0.008 1.168
#summary(test.predicted.baseline)
# accuracy
# Fraction of test rows where the baseline prediction matches the response.
test.accuracy.baseline = sum(test.predicted.baseline[,'predict']==samsungData.test.hex[,ncol])/dim(samsungData.test.hex[,ncol])[1]
test.accuracy.baseline
## [1] 0.9029066
# confusion matrix
table(as.matrix(test.predicted.baseline[,'predict']),as.matrix(samsungData.test.hex[,ncol]))# need as.marix because h2o's result is environment and can't be put in table
##
## laying sitting standing walk walkdown walkup
## laying 315 0 3 0 0 3
## sitting 0 247 26 0 0 0
## standing 0 24 261 0 0 0
## walk 0 0 0 260 5 7
## walkdown 0 0 0 11 216 76
## walkup 0 1 0 0 1 161
# NOTE(review): the original line here was a copy/paste artifact that fused
# five statements into one unparseable token run
# (test.accuracy / test.accuracy.baseline / accuracy.randForest[3,] x2 / test.accuracy);
# the intended statement, matching the "[1] 0.6672851" output below, is the
# reduced model's test-set accuracy.
test.accuracy
## [1] 0.6672851
test.accuracy.baseline
## [1] 0.9029066
accuracy.randForest[3,]
## mean sd
## 3 0.8808801 0.008699819
baseline_accuracy
## mean sd
## accuracy 0.98248404 0.003290585
# Scatter of test vs train accuracy for the 3-feature and full ("all") models,
# with CV sd as error bars on the training points (test points get sd = 0).
x = c(3, "all", 3,"all")
y = as.numeric(c(test.accuracy, test.accuracy.baseline,accuracy.randForest[3,]$mean,baseline_accuracy$mean))
sd = as.numeric(c(0,0,accuracy.randForest[3,]$sd,baseline_accuracy$sd))
group = as.factor(c("test", "test", "train", "train"))
test.accuracy.df = data.frame(x=x,y=y,group=group)
qplot(test.accuracy.df$x,test.accuracy.df$y, color=test.accuracy.df$group, shape=test.accuracy.df$group)+labs( x="number of features used", y="Accuracy")+geom_errorbar(aes(x=x, ymin=y-sd, ymax=y+sd), width=0.25)+ scale_colour_discrete(guide=F)+geom_hline(yintercept = h,lty = "dashed",show.legend = TRUE )+geom_text(aes("all",h,label = "threshold = 0.8", vjust = -1))+scale_shape_discrete(name = "dataset")
# Percent change of test accuracy relative to CV (training) accuracy, for the
# reduced and full models; the second element carries the CV sd along.
percent_reduction_reduced = c((test.accuracy -as.numeric(accuracy.randForest[3,]$mean) )*100/as.numeric(accuracy.randForest[3,]$mean),as.numeric(accuracy.randForest[3,]$sd))
names(percent_reduction_reduced ) = c("mean", "sd")
percent_reduction_full = c((test.accuracy.baseline - as.numeric( baseline_accuracy$mean))*100/as.numeric( baseline_accuracy$mean), as.numeric( baseline_accuracy$sd))
names(percent_reduction_full ) = c("mean", "sd")
# Percent difference between the reduced and full model, on test and on train.
percent_diff_test = (test.accuracy - test.accuracy.baseline)*100/ test.accuracy.baseline
percent_diff_train = (as.numeric(accuracy.randForest[3,]$mean) - as.numeric(baseline_accuracy$mean))*100/as.numeric(baseline_accuracy$mean)
percent_reduction_reduced
## mean sd
## -24.247908897 0.008699819
percent_reduction_full
## mean sd
## -8.099614810 0.003290585
percent_diff_train
## [1] -10.34154
percent_diff_test
## [1] -26.09589
# h2o.confusionMatrix(model.test)
# test.accuracy = 1 - h2o.confusionMatrix(model.test)['Totals', 'Error']
# test.accuracy
# Gather per-fold training runtimes (ms) for each reduced model; 'group' is
# the number of features used (1..5), 10 CV fold rows per group.
results = data.frame()
for (i in 1:length(model_list)){
m = model_list[[i]]
for (this.m in h2o.cross_validation_models(m)){
results = rbind.data.frame(results, c(this.m@model$run_time, i))
}
}
colnames(results) = c("runtime", "group")
results$group = factor(results$group)
results
## runtime group
## 1 29025 1
## 2 27751 1
## 3 23261 1
## 4 29131 1
## 5 23239 1
## 6 22929 1
## 7 29041 1
## 8 29242 1
## 9 29238 1
## 10 28000 1
## 11 27220 2
## 12 27666 2
## 13 26725 2
## 14 27557 2
## 15 24276 2
## 16 27772 2
## 17 25363 2
## 18 25649 2
## 19 27975 2
## 20 27893 2
## 21 24456 3
## 22 24289 3
## 23 25551 3
## 24 24455 3
## 25 25500 3
## 26 25783 3
## 27 24531 3
## 28 25720 3
## 29 23769 3
## 30 25018 3
## 31 26066 4
## 32 26638 4
## 33 26413 4
## 34 26009 4
## 35 27081 4
## 36 26055 4
## 37 26977 4
## 38 27296 4
## 39 25959 4
## 40 26936 4
## 41 26871 5
## 42 26828 5
## 43 26999 5
## 44 27045 5
## 45 27048 5
## 46 24631 5
## 47 26952 5
## 48 26992 5
## 49 27196 5
## 50 26999 5
# Mean and sd of fold runtimes per feature-count group, an error-bar plot,
# and a one-way ANOVA testing for a group effect on runtime.
reduced_means = aggregate(x=results$runtime,by= list(results$group), FUN = mean)
reduced_sd = aggregate(x=results$runtime,by= list(results$group), FUN = sd)
reduced_results = cbind.data.frame(reduced_means$x, reduced_sd$x)
colnames(reduced_results) = c("mean", "sd")
x = rownames(reduced_results)
y = reduced_results[,1]
sd = reduced_results[,2]
qplot(x,y)+geom_errorbar(aes(x=x, ymin=y-sd, ymax=y+sd), width=0.25)+labs(title = "Runtimes of models with different number of features (1-5)", x="number of features used", y="runtime(msecs)")
selected_aov = aov(runtime ~ factor(group), data = results)
selected_aov
## Call:
## aov(formula = runtime ~ factor(group), data = results)
##
## Terms:
## factor(group) Residuals
## Sum of Squares 30116002 95674823
## Deg. of Freedom 4 45
##
## Residual standard error: 1458.118
## Estimated effects may be unbalanced
summary(selected_aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(group) 4 30116002 7529000 3.541 0.0135 *
## Residuals 45 95674823 2126107
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Append the full model's fold runtimes as group "all" and repeat the
# aggregation / plot / ANOVA including the full model.
full_vs_reduce = results
full_vs_reduce$group = as.character(full_vs_reduce$group )
for (m in h2o.cross_validation_models(model)){
full_vs_reduce= rbind.data.frame(full_vs_reduce, c(m@model$run_time, "all"))
}
# rbind-ing c(numeric, "all") coerced the runtime column to character;
# restore it to numeric before aggregating.
full_vs_reduce$runtime= as.numeric(full_vs_reduce$runtime )
all_means = aggregate(x=full_vs_reduce$runtime,by= list(full_vs_reduce$group), FUN = mean)
all_sd = aggregate(x=full_vs_reduce$runtime,by= list(full_vs_reduce$group), FUN = sd)
all_results = cbind.data.frame(all_means$x, all_sd$x)
colnames(all_results) = c("mean", "sd")
rownames(all_results) = all_means$Group.1
x = rownames( all_results)
y = all_results[,1]
sd = all_results[,2]
qplot(x,y)+labs(title = "Runtimes of models with different numbers of features", x="number of features used", y="runtime (msecs)")+geom_errorbar(aes(x=x, ymin=y-sd, ymax=y+sd), width=0.25)
## aov
full_aov = aov(runtime ~ factor(group), data = full_vs_reduce)
full_aov
## Call:
## aov(formula = runtime ~ factor(group), data = full_vs_reduce)
##
## Terms:
## factor(group) Residuals
## Sum of Squares 182478434 156762975
## Deg. of Freedom 5 54
##
## Residual standard error: 1703.825
## Estimated effects may be unbalanced
summary(full_aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(group) 5 182478434 36495687 12.57 4.14e-08 ***
## Residuals 54 156762975 2903018
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# In-R object sizes (bytes) of the full model handle and its CV fold models.
# NOTE(review): object.size() measures only the R-side wrapper object, not the
# model stored in the H2O cluster.
object.size(model)
## 224296 bytes
temp = c()
for (m in h2o.cross_validation_models(model)){
temp = c(temp, object.size(m))
}
print (c(mean(temp),sd(temp)))
## [1] 200574.4000 212.9597
# Per-fold object sizes for each reduced model, grouped by feature count.
results = data.frame()
for (i in 1:length(model_list)){
m = model_list[[i]]
for (this.m in h2o.cross_validation_models(m)){
results = rbind.data.frame(results, c(object.size(this.m), i))
}
}
colnames(results) = c("size", "group")
results$group = factor(results$group)
results
## size group
## 1 50744 1
## 2 50816 1
## 3 50680 1
## 4 50760 1
## 5 50544 1
## 6 50688 1
## 7 50616 1
## 8 50616 1
## 9 50616 1
## 10 50616 1
## 11 50936 2
## 12 51080 2
## 13 51184 2
## 14 51192 2
## 15 51064 2
## 16 50944 2
## 17 51048 2
## 18 50936 2
## 19 50936 2
## 20 50880 2
## 21 51216 3
## 22 51224 3
## 23 51288 3
## 24 51224 3
## 25 51296 3
## 26 51352 3
## 27 51208 3
## 28 51288 3
## 29 51160 3
## 30 51160 3
## 31 51648 4
## 32 51544 4
## 33 51632 4
## 34 51584 4
## 35 51544 4
## 36 51656 4
## 37 51528 4
## 38 51536 4
## 39 51576 4
## 40 51536 4
## 41 51984 5
## 42 51768 5
## 43 51704 5
## 44 51992 5
## 45 51712 5
## 46 51808 5
## 47 51752 5
## 48 51984 5
## 49 51872 5
## 50 51928 5
# Mean/sd of per-fold object sizes per feature-count group, an error-bar plot,
# and a one-way ANOVA testing for a group effect on model size.
reduced_means = aggregate(x=results$size,by= list(results$group), FUN = mean)
reduced_sd = aggregate(x=results$size,by= list(results$group), FUN = sd)
reduced_results = cbind.data.frame(reduced_means$x, reduced_sd$x)
colnames(reduced_results) = c("mean", "sd")
x = rownames(reduced_results)
y = reduced_results[,1]
sd = reduced_results[,2]
qplot(x,y)+geom_errorbar(aes(x=x, ymin=y-sd, ymax=y+sd), width=0.25)+labs(title = "Model size of models with different number of features (1-5)", x="number of features used", y="bytes")+scale_y_continuous(limits = c(min(results$size), max(results$size)))
selected_aov = aov(size ~ factor(group), data = results)
selected_aov
## Call:
## aov(formula = size ~ factor(group), data = results)
##
## Terms:
## factor(group) Residuals
## Sum of Squares 8557414 350746
## Deg. of Freedom 4 45
##
## Residual standard error: 88.2856
## Estimated effects may be unbalanced
summary(selected_aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(group) 4 8557414 2139354 274.5 <2e-16 ***
## Residuals 45 350746 7794
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Append the full-feature model's CV fold sizes as group "all" so the
# reduced-feature models can be compared against the full model.
full_vs_reduce = results
full_vs_reduce$group = as.character(full_vs_reduce$group)
# Build the extra rows in a single data frame instead of rbind-ing inside a
# loop; the original c(object.size(m), "all") also coerced `size` to
# character, which then had to be patched back with as.numeric afterwards.
all_sizes = sapply(h2o.cross_validation_models(model), object.size)
full_vs_reduce = rbind.data.frame(full_vs_reduce,
                                  data.frame(size = as.numeric(all_sizes),
                                             group = "all",
                                             stringsAsFactors = FALSE))
all_means = aggregate(x = full_vs_reduce$size, by = list(full_vs_reduce$group), FUN = mean)
all_sd = aggregate(x = full_vs_reduce$size, by = list(full_vs_reduce$group), FUN = sd)
all_results = cbind.data.frame(all_means$x, all_sd$x)
colnames(all_results) = c("mean", "sd")
rownames(all_results) = all_means$Group.1
x = rownames(all_results)
y = all_results[,1]
# Named `err` rather than `sd` to avoid shadowing stats::sd.
err = all_results[,2]
qplot(x, y) +
  geom_errorbar(aes(x = x, ymin = y - err, ymax = y + err), width = 0.25) +
  labs(title = "Model size of models with different numbers of features",
       x = "number of features used", y = "bytes") +
  scale_y_continuous(limits = c(min(full_vs_reduce$size), max(full_vs_reduce$size)))
## aov
# One-way ANOVA including the full-feature model ("all" group) alongside the
# reduced models: group now has 6 levels (1-5 plus "all"), hence Df = 5.
full_aov = aov(size ~ factor(group), data = full_vs_reduce)
full_aov
## Call:
## aov(formula = size ~ factor(group), data = full_vs_reduce)
##
## Terms:
## factor(group) Residuals
## Sum of Squares 185768612796 758912
## Deg. of Freedom 5 54
##
## Residual standard error: 118.5493
## Estimated effects may be unbalanced
# The between-group sum of squares dwarfs the residual because the full
# model's sizes (~200 KB) are far from the reduced models' (~51 KB).
summary(full_aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(group) 5 1.858e+11 3.715e+10 2643654 <2e-16 ***
## Residuals 54 7.589e+05 1.405e+04
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Persist the full model and every model in the three model lists to disk.
h2o.saveModel(model, paste(getwd(), deparse(substitute(model)), sep="_"))
# BUG FIX: inside a for loop, deparse(substitute(m)) always evaluates to the
# literal string "m", so every model in every list was being saved under the
# same "<wd>_m" path rather than a per-model one. Name the destination from
# the list name plus the element index instead.
for (i in seq_along(model_list)){
  h2o.saveModel(model_list[[i]],
                paste(getwd(), paste0("model_list_", i), sep="_"))
}
h2o.saveModel(model.test, paste(getwd(), "model.test", sep="_"))
for (i in seq_along(model_list_var)){
  h2o.saveModel(model_list_var[[i]],
                paste(getwd(), paste0("model_list_var_", i), sep="_"))
}
for (i in seq_along(model_list_abs_cv)){
  h2o.saveModel(model_list_abs_cv[[i]],
                paste(getwd(), paste0("model_list_abs_cv_", i), sep="_"))
}
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this: